#############################################################
## SPPQ Anniversary Issue:                                 ##
## SPPQ and the Development of the State Politics Subfield ##
#############################################################
#R Studio: Version 2023.06.2+561 (2023.06.2+561)

#Load packages
P<-c("dplyr", "tidyr", "tibble", "purrr", "ggplot2", "stringr", "quanteda")

for (i in 1:length(P)) {
  ifelse(!require(P[i],character.only=TRUE),install.packages(P[i]),
         print(":)"))
  library(P[i],character.only=TRUE)
}

rm(P)
rm(i)

#Read in data
fullData <- read.csv("FourJournals_FINAL.csv") #obs: 10924, variables: 36
sppc <- read.csv("SPPC_FINAL.csv") #obs: 1781, variables: 35

#Subsetting to SPPQ articles
sppq <- subset(fullData, journal=="sppq")

#Subsetting to Top Three articles and Top Three state politics articles
top3all <- subset(fullData, journal!="sppq") #Top Three articles
top3 <- subset(top3all, statearticle==1) #Top Three state politics articles

##########################################################################
## Percent of Articles assigned to topics, Footnote 7 and stats, pg. 18 ##
##########################################################################
round(prop.table(table(top3$numbercodes)),2)
round(prop.table(table(sppq$numbercodes)),2)
round(prop.table(table(sppc$numbercodes)),2)

################################################################################################
## Table 1: State Politics Articles in the Top Three Journals that Study One State or Region  ##
################################################################################################
table(top3$decade)
round(prop.table(table(top3$decade,top3$mentionState),1),2)
round(prop.table(table(top3$decade,top3$mentionRegion),1),2)
round(prop.table(table(top3$decade,top3$mentionStateOrRegion),1),2)

#######################################################################################
## Table 2: State Politics Articles int he Top Three Journals, by Journal and Decade ##
#######################################################################################
#State Politics Articles
table(top3$journal,top3$decade)
colSums(table(top3$journal,top3$decade))

#All Articles
round(table(top3$journal,top3$decade)/table(top3all$journal,top3all$decade),2)
round(colSums(table(top3$journal,top3$decade))/colSums(table(top3all$journal,top3all$decade)),2)

#############################################################################################################
## Table 3: Percentage of state politics articles addressing each topic in Top-3 journals, by time period  ##
#############################################################################################################
# List of variables and topic labels
topics <- tribble(
  ~Topic,                        ~Variable,
  "Bureaucratic Politics",       "bureaucratic",
  "Criminal Justice Policy",     "crimj",
  "Direct Democracy",            "directdem",
  "Economic and Fiscal Policy",  "econ",
  "Education Policy",            "educ",
  "Elections",                   "elections",
  "Environmental Policy",        "environ",
  "Gender",                      "gender",
  "Gubernatorial Politics",      "govs",
  "Health and Welfare Policy",   "health",
  "Interest Groups",             "ig",
  "Intergovernmental Relations", "federalism",
  "Judicial Politics",           "judicial",
  "Legislative Politics",        "leg",
  "Methods",                     "methods",
  "Morality Policy",             "morality",
  "Parties",                     "parties",
  "Policy (General)",            "policy",
  "Public Opinion",              "pubop",
  "Race and Ethnic Politics",    "race"
)

#Function to calculate proportions for each dataset
calc_props <- function(data, dataset_name) {
  data %>%
    summarise(across(all_of(topics$Variable), ~mean(.x, na.rm = TRUE))) %>%
    pivot_longer(everything(), names_to = "Variable", values_to = "Proportion") %>%
    left_join(topics, by = "Variable") %>%
    mutate(Dataset = dataset_name)
}

#Apply to both datasets
df_sppq <- calc_props(sppq, "SPPQ")
df_sppc <- calc_props(sppc, "SPPC")
df_top3 <- calc_props(top3, "Top-3")

#Combine
plot_data <- bind_rows(df_sppq, df_sppc, df_top3)
print(plot_data,n=1000)

three_periods <- function(y) {
  case_when(
    y >= 1960 & y <= 1985 ~ "1960–1985",
    y >= 1986 & y <= 2000 ~ "1986–2000",
    y >= 2001 & y <= 2025 ~ "2001–2025",
    TRUE ~ NA_character_
  )
}

#Pool datasets
pooled <- bind_rows(
  sppq %>% mutate(Dataset = "SPPQ"),
  sppc %>% mutate(Dataset = "SPPC"),
  top3 %>% mutate(Dataset = "Top-3")
) %>%
  filter(!is.na(year)) %>%
  mutate(period = three_periods(year)) %>%
  filter(!is.na(period))

#Function to compute percentages and p-value across 3 periods
topic_test <- function(varname) {
  fml <- as.formula(paste(varname, "~ period"))
  test <- oneway.test(fml, data = pooled)
  pooled %>%
    group_by(period) %>%
    summarise(prop = mean(.data[[varname]], na.rm = TRUE), .groups = "drop") %>%
    pivot_wider(names_from = period, values_from = prop) %>%
    mutate(
      Variable = varname,
      p_value = test$p.value)
}

#Apply to all topics
results_three <- map_dfr(topics$Variable, topic_test) %>%
  left_join(topics, by = "Variable") %>%
  select(Topic, `1960–1985`, `1986–2000`, `2001–2025`, p_value) %>%
  mutate(across(-c(Topic, p_value), ~ round(.x * 100, 2)), 
         p_value = round(p_value, 2))

#####################################################################################################
## TABLE 4: Difference in the Proportion of Articles, By Topic, from 2001 to 2012 and 2013 to 2025 ##
#####################################################################################################
#Define periods
half_period <- function(y) {
  case_when(
    y >= 2001 & y <= 2012 ~ "2001–2012",
    y >= 2013 & y <= 2025 ~ "2013–2025",
    TRUE ~ NA_character_)
}

#Function to compute difference & p-value for one dataset
dataset_diff_p <- function(data, dataset_name) {
  data_pp <- data %>%
    filter(!is.na(year)) %>%
    mutate(period = half_period(year)) %>%
    filter(!is.na(period))
  
  topic_test <- function(varname) {
    fml <- as.formula(paste(varname, "~ period"))
    test <- t.test(fml, data = data_pp)   # t-test on binary
    
    summary <- data_pp %>%
      group_by(period) %>%
      summarise(prop = mean(.data[[varname]], na.rm = TRUE), .groups = "drop") %>%
      pivot_wider(names_from = period, values_from = prop)
    
    summary %>%
      mutate(
        Topic = topics$Topic[topics$Variable == varname],
        Difference = (`2013–2025` - `2001–2012`) * 100,
        p_value = test$p.value
      ) %>%
      select(Topic, Difference, p_value)
  }
  
  map_dfr(topics$Variable, topic_test) %>%
    rename_with(~ paste0(dataset_name, "_", .), -Topic)
}

#Apply to each dataset
sppq_res <- dataset_diff_p(sppq, "SPPQ")
sppc_res <- dataset_diff_p(sppc, "SPPC")
top3_res <- dataset_diff_p(top3, "Top3")

#Merge into one table
final_results <- sppq_res %>%
  left_join(sppc_res, by = "Topic") %>%
  left_join(top3_res, by = "Topic") %>%
  mutate(across(-Topic, ~ round(.x, 2)))

#####################################################################################################
## Figure 1: Most Frequent Bigrams in State Politics Articles Published in Top Journals, by Period ##
#####################################################################################################
#Filter to state politics articles
only_state_df <- fullData %>% 
  filter(statearticle==1)

#Separate into time frames, top 3 journals
top_3_period_1 <- only_state_df %>% 
  filter(journal %in% c("ajps", "apsr", "jop") & year<1986)

top_3_period_2 <- only_state_df %>% 
  filter(journal %in% c("ajps", "apsr", "jop") & year>=1986 & year<2001)

top_3_period_3 <- only_state_df %>% 
  filter(journal %in% c("ajps", "apsr", "jop") & year>=2001 & year<=2025)

##Function for bigram analysis
analyze_bigrams <- function(data, period_label, filename, plot_title) {
  spp_corp <- corpus(data, docid_field = "key", text_field = "abstract") #creating corpus
  
  spp_tokens <- spp_corp %>%
    tokens(what = "word",
           remove_punct = TRUE,
           remove_symbols = TRUE,
           remove_numbers = TRUE,
           remove_url = TRUE,
           remove_separators = TRUE,
           include_docvars = FALSE) %>%
    tokens_tolower() %>%
    tokens_remove(stopwords("english")) #tokenizing and processing
  
  tokens_bigrams <- tokens_ngrams(spp_tokens, n = 2) #creating bigrams
  
  dfm_bigrams <- dfm(tokens_bigrams) #creating DFM
  
  docfreq_bigrams <- docfreq(dfm_bigrams)
  docfreq_df <- data.frame(
    feature = names(docfreq_bigrams),
    docfreq = as.numeric(docfreq_bigrams),
    row.names = NULL) #creating data frame for plot
  
  top_bigrams_docfreq <- docfreq_df %>%
    arrange(desc(docfreq)) %>%
    slice(1:15) %>%
    mutate(feature = str_replace_all(feature, "_", " ")) #cutting to top 15
  
  png(filename, width = 1950, height = 1300, res = 300)
  p <- ggplot(top_bigrams_docfreq, aes(x = reorder(feature, docfreq), y = docfreq)) +
    geom_col(fill = "grey50") +
    labs(title = plot_title, x = "", y = "Document Frequency") +
    coord_flip() +
    theme_minimal() +   
    theme(
      plot.title = element_text(hjust = 0.5),
      axis.text.x = element_text(color = "black",size=10),
      axis.text.y = element_text(color = "black",size=10)
    )
  print(p)
  dev.off() #plotting
}

#Plotting
analyze_bigrams(
  top_3_period_1,
  "top3_p1",
  "top3_p1_bigrams_11_11_25.png",
  "15 Most Frequent Bigrams, Top Three Journals \n(1960–1985)")

analyze_bigrams(top_3_period_2,
                "top3_p2",
                "top3_p2_bigrams_11_11_25.png",
                "15 Most Frequent Bigrams, Top Three Journals \n(1986–2000)")

analyze_bigrams(top_3_period_3,
                "top3_p3",
                "top3_p3_bigrams_11_11_25.png",
                "15 Most Frequent Bigrams, Top Three Journals \n(2001–2025)")

##########################################
## FIGURE 2: Count of Articles, by Year ##
##########################################
#Count papers and posters by year for each dataset
counts_sppq <- sppq %>%
  group_by(year, paper) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(Dataset = "SPPQ")

counts_sppc <- sppc %>%
  group_by(year, paper) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(Dataset = "SPPC")

#Combine
counts <- bind_rows(counts_sppq, counts_sppc) %>%
  mutate(
    paper = factor(paper, labels = c("Poster", "Paper")),
    Dataset = factor(Dataset, levels = c("SPPQ", "SPPC")))

#Plot with stacked bars
png("article_count.png", width = 1950, height = 1300, res = 300)
ggplot(counts, aes(x = year, y = count, fill = paper)) +
  geom_col() +
  scale_fill_manual(values = c("gray60", "gray20")) +
  facet_wrap(~Dataset, ncol = 1, scales = "free_y") +
  labs(x = "Year", y = "Number of Articles",
       fill = "Type",
       title = "Article Counts by Year (Papers vs Posters)") +
  theme_minimal() +   theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(color = "black",size=10),
    axis.text.y = element_text(color = "black",size=10)
  )
dev.off()

############################################################
## FIGURE 3: Percentage of Articles Addressing Each Topic ##
############################################################
#List of variables and topic labels
topics <- tribble(
  ~Topic,                        ~Variable,
  "Bureaucratic Politics",       "bureaucratic",
  "Criminal Justice Policy",     "crimj",
  "Direct Democracy",            "directdem",
  "Economic and Fiscal Policy",  "econ",
  "Education Policy",            "educ",
  "Elections",                   "elections",
  "Environmental Policy",        "environ",
  "Gender",                      "gender",
  "Gubernatorial Politics",      "govs",
  "Health and Welfare Policy",   "health",
  "Interest Groups",             "ig",
  "Intergovernmental Relations", "federalism",
  "Judicial Politics",           "judicial",
  "Legislative Politics",        "leg",
  "Methods",                     "methods",
  "Morality Policy",             "morality",
  "Parties",                     "parties",
  "Policy (General)",            "policy",
  "Public Opinion",              "pubop",
  "Race and Ethnic Politics",    "race"
)

#Function to calculate proportions for each dataset
calc_props <- function(data, dataset_name) {
  data %>%
    summarise(across(all_of(topics$Variable), ~mean(.x, na.rm = TRUE))) %>%
    pivot_longer(everything(), names_to = "Variable", values_to = "Proportion") %>%
    left_join(topics, by = "Variable") %>%
    mutate(Dataset = dataset_name)
}

#Apply to both datasets
df_sppq <- calc_props(sppq, "SPPQ")
df_sppc <- calc_props(sppc, "SPPC")
df_top3 <- calc_props(top3, "Top 3")

#Combine
plot_data <- bind_rows(df_sppq, df_sppc, df_top3)
print(plot_data,n=1000)

#Plot
png("topic_prop.png", width = 1950, height = 2300, res = 300)
ggplot(plot_data, aes(x = reorder(Topic, Proportion), y = Proportion, fill = Dataset)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("gray80", "gray50","black")) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "", y = "Proportion of Papers", title = "Proportion of Papers by Topic") +
  theme_minimal() +   theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(color = "black",size=10),
    axis.text.y = element_text(color = "black",size=10)
  )
dev.off()

#############################################################################################
## Figure 4: Most Frequent Bigrams in State Politics Articles Published in SPPQ, by Period ##
#############################################################################################
#Separate into time frames, SPPQ articles
sppq_period_1 <- only_state_df %>% 
  filter(journal %in% c("sppq") & year>=2001 & year<=2012)

sppq_period_2 <- only_state_df %>% 
  filter(journal %in% c("sppq") & year>2013 & year<=2025)

#Plot using same function from Figure 1
analyze_bigrams(sppq_period_1,
                "sppq_p1",
                "sppq_p1_bigrams_11_11_25.png",
                "15 Most Frequent Bigrams, State Politics and \nPolicymaking Quarterly (2001–2012)")

analyze_bigrams(sppq_period_2,
                "sppq_p2",
                "sppq_p2_bigrams_11_11_25.png",
                "15 Most Frequent Bigrams, State Politics and \nPolicymaking Quarterly (2013–2025)")

################################################
## FIGURE 5: Trends in Coauthorship Over Time ##
################################################
# Add dataset labels and combine
sppq_auth <- sppq %>% mutate(Dataset = "SPPQ")
sppc_auth <- sppc %>% mutate(Dataset = "SPPC")
sppc_auth <- subset(sppc_auth,paper==1)
top3_auth <- top3 %>% mutate(Dataset = "Top 3")

authors_data <- bind_rows(sppq_auth, sppc_auth, top3_auth) %>%
  filter(!is.na(year), !is.na(numauthors))

# Compute proportion of papers with >1 author by year
prop_multi <- authors_data %>%
  group_by(Dataset, year) %>%
  summarise(prop_multi = mean(numauthors > 1, na.rm = TRUE),
            n = n(),
            .groups = "drop")

# Plot with loess smoother, line types, and 50% ref line
png("coauthor.png", width = 1950, height = 1300, res = 300)
ggplot(prop_multi, aes(x = year, y = prop_multi, linetype = Dataset, color = Dataset)) +
  #geom_point(size = 2, alpha = 0.7) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1) +
  scale_color_manual(values = c("gray80", "gray50","black")) +
  geom_hline(yintercept = 0.5, linetype = "solid", color = "black", linewidth = 0.6) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits = c(.0,1)) +
  labs(
    x = "Year",
    y = "Proportion Multi-Authored",
    title = "Proportion of Multi-Authored Papers, by Year",
    linetype = "Dataset",
    color = "Dataset"
  ) +
  theme_minimal() +   theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(color = "black",size=10),
    axis.text.y = element_text(color = "black",size=10)
  )
dev.off()

